In [36]:
from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'

# Participants that are excluded from all performance analysis
non_english_fluent  = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]

excluded_all_tasks = non_english_fluent + left_handed

# Pilot subjects
excluded_all_tasks += ['010', '011', '012', '013', '014']

In [37]:
def isolate_isip_task(taskname):
    # relies on the externally defined `dbase` variable
    db_to_isip = dbase.swaplevel(1,2).swaplevel(0,1).xs('tap_r', drop_level=True)
    del db_to_isip['channel']
    del db_to_isip['pitch']
    #channel==1 and pitch==48 verified for all tap_r data points
    db_allsubs = db_to_isip.xs(taskname, drop_level=True)
    return db_allsubs
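
Not part of the original run: a minimal sketch of the swaplevel/xs pattern used above, on a toy frame. The level order task_name > pid > stamp_type is assumed here, inferred from the two xs() calls (the real dbase has additional index levels, so the numeric swaplevel arguments differ).

In [ ]:
toy = pd.DataFrame(
    {'task_ms': range(8)},
    index=pd.MultiIndex.from_product(
        [['ISIP_5', 'ISIP_8'], ['015', '016'], ['tap_r', 'tap_l']],
        names=['task_name', 'pid', 'stamp_type']))
(toy.swaplevel(1, 2)                 # -> task_name, stamp_type, pid
    .swaplevel(0, 1)                 # -> stamp_type, task_name, pid
    .xs('tap_r', drop_level=True)    # slice off the stamp_type level
    .xs('ISIP_5', drop_level=True))  # keep one task; left indexed by pid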

In [38]:
def run_isip_grouping_filter(df,
                             minimum_interval,
                             #filter_stdev_radius,
                             maximum_interval,
                             start_recording):

    grouped = df.groupby(level='pid') #, 'stamp_type'])  

    def filter_intervals(df):
        filt1 = df.copy()
        # remove timestamps prior to the end of stimuli    
        filt1 = filt1.loc[filt1.task_ms >= start_recording]    
        # remove timestamps that end an undersized interval
        filt1 = filt1.loc[  (filt1.int_raw >= minimum_interval) 
                          | (filt1.int_raw.isnull()) ]    
                
        # special case: p048 @800ms, outlier short intervals 
        # among >1000 average interval
        # (no longer needed: p048 was left-handed and already excluded)
        #if filt1.int_raw.median() > 1000:
        #    print(filt1.int_raw.count())
        #    print("applied")
        #    filt1 = filt1.loc[  (filt1.int_raw >= 650) 
        #                      | (filt1.int_raw.isnull()) ]    
        #    print(filt1.int_raw.count())

        # special case: p049 @500ms, outlier long intervals 
        # among a well-under-target (<375 ms) median interval
        if filt1.int_raw.median() < 375:
            print(filt1.int_raw.count())
            print("applied")
            filt1 = filt1.loc[  (filt1.int_raw <= 625) 
                              | (filt1.int_raw.isnull()) ]    
            print(filt1.int_raw.count())
        # end special cases
        
        # interval recalculation now skips the too-soon taps.
        int_filt1 = filt1.task_ms - filt1.task_ms.shift(1)
        int_max_exceeded = int_filt1 > maximum_interval
        
        # int_filt1 is our nearly-final interval sequence: it still has NaNs where 
        # short intervals were merged, and it still contains overlong intervals.
    
        # Make a pair of series containing the same indexes: only those 
        # that weren't removed due to too-SHORT intervals. (those we combined
        # together because they were too short are 'legitimate' and included
        # as an interval in the lagging. Those we excluded because they were
        # overlong disrupt the lagged-comparison process and should trigger
        # an exclusion of data from the lagged deviation calculation.)
        # Or more succinctly: avoid calculating a lag-2 dev before and after 
        # a skipped, overlong interval.
        intsequence = int_filt1[int_filt1.notnull()]
        maxex_sequence = int_max_exceeded[int_filt1.notnull()]
        intsequence[maxex_sequence == True] = np.nan
        
        df['ints_filtered'] = intsequence
        
        def skip_missing_lagvalues(series, lag_values, min_values='all'):
            # drop entries whose lagged positions land on a masked (overlong)
            # interval; relies on maxex_sequence from the enclosing scope
            # (min_values is accepted but currently unused)
            for n in lag_values:
                series = series[maxex_sequence.shift(n) != True]
            return series
        
        lag2dev = intsequence - intsequence.shift(2)
        lag2dev = skip_missing_lagvalues(lag2dev, [0, 1, 2])
        df['lag2dev'] = lag2dev
        df['lag2devsq'] = lag2dev ** 2
        
        # rolling_mean(): "By default, the result is set to the right edge of the 
        #                  window. This can be changed to the center of the window 
        #                  by setting center=True."
        # shift(1) sets the result series one ahead of the end of the window, so that 
        # each value is compared with the mean of the N intervals preceding it.
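        # Tiny alignment example (illustrative values, not from the data):
        #   intsequence:            i0    i1     i2          i3
        #   .shift(1):              NaN   i0     i1          i2
        #   rolling_mean(window=2): NaN   NaN   (i0+i1)/2   (i1+i2)/2
        # so each row holds the mean of the two intervals preceding it.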
        
        movingmean_prev2 = pd.rolling_mean(intsequence.shift(1), window=2)
        movingmean_prev2 = skip_missing_lagvalues(movingmean_prev2, [1, 2])
        df['movingmean_prev2'] = movingmean_prev2
        
        lagdev_avgprev2 = intsequence - pd.rolling_mean(intsequence.shift(1), window=2)
        lagdev_avgprev2 = skip_missing_lagvalues(lagdev_avgprev2, [0, 1, 2])
        df['lagdev_avgprev2'] = lagdev_avgprev2
        df['lagdev_avgprev2sq'] = lagdev_avgprev2 ** 2
        
        movingmean_prev3 = pd.rolling_mean(intsequence.shift(1), window=3)
        movingmean_prev3 = skip_missing_lagvalues(movingmean_prev3, [1, 2, 3])
        df['movingmean_prev3'] = movingmean_prev3
        
        lagdev_avgprev3 = intsequence - pd.rolling_mean(intsequence.shift(1), window=3)
        #lagdev_avgprev3 = skip_missing_lagvalues(lagdev_avgprev3, [0, 1, 2, 3])
        df['lagdev_avgprev3'] = lagdev_avgprev3
        df['lagdev_avgprev3sq'] = lagdev_avgprev3 ** 2
        
        movingmean_prev4 = pd.rolling_mean(intsequence.shift(1), window=4)
        movingmean_prev4 = skip_missing_lagvalues(movingmean_prev4, [1, 2, 3, 4])
        df['movingmean_prev4'] = movingmean_prev4
        
        lagdev_avgprev4 = intsequence - pd.rolling_mean(intsequence.shift(1), window=4)
        lagdev_avgprev4 = skip_missing_lagvalues(lagdev_avgprev4, [0, 1, 2, 3, 4])
        df['lagdev_avgprev4'] = lagdev_avgprev4
        df['lagdev_avgprev4sq'] = lagdev_avgprev4 ** 2

        df['LD_AP4_MMskipmis_var'] = movingmean_prev4.var(ddof=1)
        df['LD_AP4_MMskipmis_std'] = movingmean_prev4.std(ddof=1)
        df['LD_AP4_MMskipmis_len'] = movingmean_prev4.count()
        temp_noskip_4 = pd.rolling_mean(intsequence.shift(1), window=4)
        df['LD_AP4_MMkeepmis_var'] = temp_noskip_4.var(ddof=1)
        df['LD_AP4_MMkeepmis_std'] = temp_noskip_4.std(ddof=1)
        df['LD_AP4_MMkeepmis_len'] = temp_noskip_4.count()
                
        movingmean_prev_12 = pd.rolling_mean(intsequence.shift(1), window=12)
        #movingmean_prev_12 = skip_missing_lagvalues(movingmean_prev_12, range(1, 12 + 1))
        df['movingmean_prev_12'] = movingmean_prev_12
        lagdev_avgprev_12 = intsequence - pd.rolling_mean(intsequence.shift(1), window=12)
        #lagdev_avgprev_12 = skip_missing_lagvalues(lagdev_avgprev_12, range(0, 12 + 1))
        df['lagdev_avgprev_12'] = lagdev_avgprev_12
        df['lagdev_avgprev_12sq'] = lagdev_avgprev_12 ** 2
        
        df['int_filt1'] = int_filt1
        df['int_max_exceeded'] = int_max_exceeded
        df['ints'] = int_filt1.loc[~int_max_exceeded]
        return df
    
    df = grouped.apply(filter_intervals)
    #print(df)
    return df
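
Not part of the original run: the masking above can be checked on a toy interval series. An overlong interval becomes NaN in the filtered sequence, and any lag-2 difference whose window straddles it is dropped, mirroring skip_missing_lagvalues. (In later pandas versions, pd.rolling_mean(s, window=n) is written s.rolling(n).mean().)

In [ ]:
ints = pd.Series([500., 510., 1400., 505., 495., 500.])  # one overlong interval
maxex = ints > 650
masked = ints.copy()
masked[maxex] = np.nan              # overlong interval -> NaN
lag2 = masked - masked.shift(2)     # lag-2 deviations
for n in [0, 1, 2]:                 # drop devs whose window touches the gap
    lag2 = lag2[maxex.shift(n) != True]
lag2                                # only positions 0, 1, and 5 survive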

(Image from a related article; the "34" shown in it does not apply to the current dataset.)


In [39]:
def isip_outcomes_taskdf(isip_db, 
                         squared_local_dev_measure='lag2devsq'):
    pid_list = sorted(list(isip_db.index.get_level_values('pid').unique()))
    
    df = pd.DataFrame(index=pid_list)
    df.index.names = ['pid']
    
    ints = isip_db.ints.groupby(level='pid')
    ints_count = ints.apply(lambda s: s.count()) #count() ignores nulls
    ints_mean = ints.apply(lambda s: s.mean())
    ints_variance = ints.apply(lambda s: s.var(ddof=1))
    ints_stdev = ints.apply(lambda s: s.std(ddof=1))
    
    ints_lag2corr = ints.corr(ints.shift(2))
    
    lagdevsq_series = isip_db[squared_local_dev_measure]
    lagdevsq = lagdevsq_series.groupby(level='pid')    
    lagdevsq_count = lagdevsq.apply(lambda s: s.count()) # (N - 2)    
    lagdevsq_mean = lagdevsq.apply(lambda s: s.mean())    
    
    # sum over i of (x_{i+2} - x_i)^2
    lagdevsq_sum = lagdevsq.apply(lambda s: s.sum())
        
    #because of the problem below, we might need to change the lagdevsq_count variable
    # to be the overall count (ints_count) minus one.
    local_sq_abs = lagdevsq_sum / (2. * lagdevsq_count)
    local = 100 * (1. / ints_mean) * np.sqrt(local_sq_abs)
    
    #PROBLEM: total variance uses all the intervals. local_sq_abs removes data points
    #when they aren't sequential...
    drift = 100 * ((1. / ints_mean) * np.sqrt(ints_variance - local_sq_abs))
    
    df['ints_count'] = ints_count
    df['ints_mean'] = ints_mean
    df['ints_variance'] = ints_variance
    df['ints_stdev'] = ints_stdev
    df['ints_lag2corr'] = ints_lag2corr
    #df['devsq_sum'] = lagdevsq_sum
    #df['devsq_count'] = lagdevsq_count    
    #df['devsq_mean'] = lagdevsq_mean
    #df['local_sq_abs'] = local_sq_abs
    #df['local'] = local
    #df['drift'] = drift
    
    df[squared_local_dev_measure + '_sum'] = lagdevsq_sum
    df[squared_local_dev_measure + '_count'] = lagdevsq_count    
    df[squared_local_dev_measure + '_mean'] = lagdevsq_mean
    df[squared_local_dev_measure + '_local_sq_abs'] = local_sq_abs
    df[squared_local_dev_measure + '_local'] = local
    df[squared_local_dev_measure + '_drift'] = drift
    
    return df
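
Not part of the original run: a worked sketch of the local/drift decomposition computed above, on made-up intervals. local scales the root mean squared lag-2 deviation (halved) by the mean interval; drift takes the remainder of the total variance, so it comes out NaN whenever local_sq_abs exceeds ints_variance (the PROBLEM noted above).

In [ ]:
x = pd.Series([500., 520., 490., 515., 505.])   # made-up intervals
mean, var = x.mean(), x.var(ddof=1)
lag2devsq = (x - x.shift(2)) ** 2
local_sq_abs = lag2devsq.sum() / (2. * lag2devsq.count())
local = 100 * (1. / mean) * np.sqrt(local_sq_abs)
drift = 100 * (1. / mean) * np.sqrt(var - local_sq_abs)  # NaN if var < local_sq_abs
local, drift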

Params / unpickle dbase


In [40]:
#recording of intervals starts (ISI x 5) after stims end (2.5s, 4.0s)

ISIP_5_ENDSTIMS_MS = 19500
ISIP_5_WAIT_AFTER_STIMS_MS = 2500
ISIP_5_MINIMUM_INT = 375
ISIP_5_MAXIMUM_INT = 650
isip_5_start_recording = ISIP_5_ENDSTIMS_MS + ISIP_5_WAIT_AFTER_STIMS_MS
#ISIP_500_DISQUALIFIED = ['045', #did not complete task correctly
#                         '042', '048' #left-handed
#                         ] 

ISIP_8_ENDSTIMS_MS = 23200
ISIP_8_WAIT_AFTER_STIMS_MS = 4000
ISIP_8_MINIMUM_INT = 600
ISIP_8_MAXIMUM_INT = 1000
isip_8_start_recording = ISIP_8_ENDSTIMS_MS + ISIP_8_WAIT_AFTER_STIMS_MS
#ISIP_800_DISQUALIFIED = ['045', #did not complete task correctly
#                         '042', '048' #left-handed
#                         ] 

pickled_dbase = "c:/db_pickles/pickle - dbase - 2014-10-03b.pickle"
dbase = pd.read_pickle(pickled_dbase)
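
Quick consistency check, not part of the original run: the post-stimulus wait equals five target ISIs, matching the "(ISI x 5)" note above.

In [ ]:
assert ISIP_5_WAIT_AFTER_STIMS_MS == 5 * 500
assert ISIP_8_WAIT_AFTER_STIMS_MS == 5 * 800
print(isip_5_start_recording, isip_8_start_recording)  # 22000 27200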

In [41]:
dbase = dbase.drop(excluded_all_tasks, level='pid')

In [42]:
db_isip5_allsubs = isolate_isip_task('ISIP_5')
db_isip8_allsubs = isolate_isip_task('ISIP_8')

db_isip5 = db_isip5_allsubs
db_isip8 = db_isip8_allsubs
#db_isip5 = db_isip5_allsubs.drop(ISIP_500_DISQUALIFIED)
#db_isip8 = db_isip8_allsubs.drop(ISIP_800_DISQUALIFIED)

#delete_columns_for_filter_debug_rerun(db_isip5)    
print('isip5:')
db_isip5 = run_isip_grouping_filter(db_isip5,
                                    minimum_interval = ISIP_5_MINIMUM_INT,
                                    #filter_stdev_radius = ISIP_5_FILTER_STDEV_RADIUS,
                                    maximum_interval = ISIP_5_MAXIMUM_INT,
                                    start_recording = isip_5_start_recording)

print('isip8:')
#delete_columns_for_filter_debug_rerun(db_isip8)
db_isip8 = run_isip_grouping_filter(db_isip8,
                                    minimum_interval = ISIP_8_MINIMUM_INT,
                                    #filter_stdev_radius = ISIP_8_FILTER_STDEV_RADIUS,
                                    maximum_interval = ISIP_8_MAXIMUM_INT,
                                    start_recording = isip_8_start_recording)


isip5:
isip8:

In [44]:
pid_list_800 = sorted(list(db_isip8.index.get_level_values('pid').unique()))
pid_list_500 = sorted(list(db_isip5.index.get_level_values('pid').unique()))

print('\n\n800')
for pid in pid_list_800:
    print(pid, end=",")
    assert db_isip8.ints_filtered.xs(pid).max() <= 1000
    assert db_isip8.ints_filtered.xs(pid).min() >= 600
    print(db_isip8.ints_filtered.xs(pid).count())

print('\n\n500')
for pid in pid_list_500:
    print(pid, end=",")
    assert db_isip5.ints_filtered.xs(pid).max() <= 650
    assert db_isip5.ints_filtered.xs(pid).min() >= 375
    print(db_isip5.ints_filtered.xs(pid).count())



800
015,134
016,112
017,109
018,114
019,116
020,111
021,121
022,109
024,124
025,105
026,117
027,112
028,122
029,115
030,108
032,117
033,116
034,114
035,116
036,123
037,119
038,112
039,120
040,118
041,119
043,132
044,111
046,124
047,110
048,13
049,109
051,126
052,111
053,105
054,113
055,125
056,125
057,119
058,113
059,117
060,111
061,114
062,113
063,114
064,120
065,94
066,110
067,122
068,112
069,115
071,125
072,112
073,80
074,113
075,119
076,114
077,117
078,115
079,112
080,122
081,114
082,118
083,112
084,112
085,114
086,111
087,89
089,120
090,119
091,131
092,118
093,117
094,81
095,115
096,105
097,115
098,118
099,124
100,118
101,126
102,114
103,116
104,113
105,121
107,113
108,118
109,122
110,114
111,103
112,119
113,114
114,117
115,119
116,113
117,78
118,109
119,111
120,121
121,105


500
015,118
016,107
017,113
018,117
019,116
020,108
021,117
022,118
024,115
025,106
026,110
027,116
028,113
029,119
030,110
032,114
033,114
034,115
035,114
036,113
037,116
038,116
039,121
040,111
041,113
043,118
044,114
046,114
047,118
048,108
049,32
051,112
052,115
053,113
054,111
055,116
056,115
057,114
058,109
059,117
060,122
061,111
062,115
063,115
064,113
065,110
066,118
067,127
068,119
069,122
071,121
072,119
073,127
074,116
075,113
076,115
077,114
078,117
079,115
080,118
081,119
082,111
083,109
084,117
085,114
086,106
087,112
089,117
090,120
091,119
092,112
093,123
094,109
095,113
096,110
097,119
098,115
099,112
100,114
101,118
102,115
103,112
104,108
105,115
107,114
108,103
109,109
110,115
111,114
112,113
113,115
114,117
115,111
116,113
117,108
118,113
119,108
120,120
121,115

In [33]:
db_isip8.ints_filtered.xs('015').hist()


Out[33]:
<matplotlib.axes.AxesSubplot at 0x12078748>

Export into a flat, participant-indexed variables dataframe


In [45]:
#excluded: the number of qualifying intervals is far below normal.
# This applies only to p049 on the 500ms task and p048 on the 800ms task.

print(db_isip5.ints_filtered.xs('049').count())
print(db_isip8.ints_filtered.xs('048').count())

db_isip5 = db_isip5.drop('049', level='pid')
db_isip8 = db_isip8.drop('048', level='pid')


32
13

In [47]:
#outcome_dfs_isip5 = {}

outcomesdf_isip5_lag2 =     isip_outcomes_taskdf(db_isip5, 'lag2devsq')
#outcomesdf_isip5_avgprev2 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev2sq')
#outcomesdf_isip5_avgprev3 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev3sq')
outcomesdf_isip5_avgprev4 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev4sq')
#outcomesdf_isip5_avgprev_12 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev_12sq')

outcomesdf_isip8_lag2 =     isip_outcomes_taskdf(db_isip8, 'lag2devsq')
#outcomesdf_isip8_avgprev2 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev2sq')
#outcomesdf_isip8_avgprev3 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev3sq')
outcomesdf_isip8_avgprev4 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev4sq')
#outcomesdf_isip8_avgprev_12 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev_12sq')

Export results to CSV, then proceed to next notebooks


In [48]:
updated = "2014-10-12b"

db_isip5.to_csv('isip5_intervals - ' + updated + '.csv')
db_isip8.to_csv('isip8_intervals - ' + updated + '.csv')

outcomesdf_isip5_lag2.to_csv('dfo-isip5_lag2 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev2.to_csv('dfo-isip5_avgprev2 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev3.to_csv('dfo-isip5_avgprev3 - ' + updated + '.csv')
outcomesdf_isip5_avgprev4.to_csv('dfo-isip5_avgprev4 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev_12.to_csv('dfo-isip5_avgprev_12 - ' + updated + '.csv')

outcomesdf_isip8_lag2.to_csv('dfo-isip8_lag2 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev2.to_csv('dfo-isip8_avgprev2 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev3.to_csv('dfo-isip8_avgprev3 - ' + updated + '.csv')
outcomesdf_isip8_avgprev4.to_csv('dfo-isip8_avgprev4 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev_12.to_csv('dfo-isip8_avgprev_12 - ' + updated + '.csv')

prefix = "c:/db_pickles/pickle - "
outcomesdf_isip5_lag2.to_pickle(prefix + 'dfo-isip5_lag2 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev2.to_pickle(prefix + 'dfo-isip5_avgprev2 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev3.to_pickle(prefix + 'dfo-isip5_avgprev3 - ' + updated + '.pickle')
outcomesdf_isip5_avgprev4.to_pickle(prefix + 'dfo-isip5_avgprev4 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev_12.to_pickle(prefix + 'dfo-isip5_avgprev_12 - ' + updated + '.pickle')

outcomesdf_isip8_lag2.to_pickle(prefix + 'dfo-isip8_lag2 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev2.to_pickle(prefix + 'dfo-isip8_avgprev2 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev3.to_pickle(prefix + 'dfo-isip8_avgprev3 - ' + updated + '.pickle')
outcomesdf_isip8_avgprev4.to_pickle(prefix + 'dfo-isip8_avgprev4 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev_12.to_pickle(prefix + 'dfo-isip8_avgprev_12 - ' + updated + '.pickle')

(Everything below is exploratory and not needed for the next step.)

Post-export: testing results


In [50]:
#Confirming that the errors in the drift calculations were due to negative 
#lag-2 autocorrelations in the data.

def negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf):
    df = outcomesdf
    
    bycorr = df.ints_lag2corr.sort(inplace=False, ascending=True)
    ranks = bycorr.reset_index().reset_index()
    ranks['ranknum'] = ranks['index'] + 1
    ranks = ranks.set_index('pid')
    
    drifterrors = list(df[df.lag2devsq_drift.isnull()].index.values)
    dranks = {d: (ranks.loc[d].ranknum, ranks.loc[d].ints_lag2corr) for d in drifterrors}
    return dranks

[ 'isip500',
  negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf_isip5_lag2),
  'isip800',
  negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf_isip8_lag2)
]


Out[50]:
['isip500',
 {'016': (7.0, -0.18011009562769489),
  '022': (19.0, -0.077318401758675104),
  '033': (2.0, -0.36398911213809126),
  '036': (23.0, -0.066149949529849969),
  '044': (13.0, -0.11614305890152636),
  '052': (15.0, -0.10259161718889551),
  '053': (27.0, -0.044324588330401017),
  '056': (24.0, -0.061102016273541497),
  '061': (18.0, -0.083054167548957683),
  '077': (6.0, -0.21300595349211318),
  '078': (12.0, -0.12239673626254305),
  '084': (22.0, -0.068490437870905738),
  '085': (3.0, -0.31383563731802194),
  '089': (17.0, -0.085148815829851968),
  '092': (9.0, -0.15372357912950563),
  '096': (21.0, -0.076982421201613069),
  '097': (26.0, -0.044972277832436111),
  '101': (14.0, -0.1129626278118966),
  '105': (31.0, 0.012985936150983755),
  '107': (5.0, -0.24330962478469564),
  '109': (20.0, -0.07703639077522681),
  '110': (11.0, -0.14000336455025097),
  '111': (10.0, -0.15060305422862588),
  '115': (29.0, -0.023845574052905719),
  '116': (8.0, -0.17025624797714051)},
 'isip800',
 {'018': (4.0, -0.11811734350973402),
  '057': (2.0, -0.17087419981036001),
  '059': (5.0, -0.025420144525136891),
  '064': (3.0, -0.13645588774203291),
  '066': (7.0, -0.0092054999827699338),
  '116': (1.0, -0.22704639231499918)}]
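
Not from the original run, but it fills in why the NaNs track negative autocorrelation: for a roughly stationary series, the expected squared lag-2 difference is 2*s^2*(1 - r2), so lag2devsq_local_sq_abs is approximately s^2*(1 - r2), and the drift term's sqrt argument, ints_variance - local_sq_abs ~ s^2*r2, goes negative exactly when r2 < 0 — matching the table above, where nearly every failing participant has a negative ints_lag2corr. A toy confirmation on simulated data:

In [ ]:
np.random.seed(0)
e = pd.Series(np.random.randn(200))
x = e - 0.5 * e.shift(2).fillna(0)    # induces negative lag-2 autocorrelation
lsa = ((x - x.shift(2)) ** 2).sum() / (2. * (x.count() - 2))
print(x.var(ddof=1) - lsa)            # negative, so np.sqrt() would give NaN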

Univariate inspections: absolute-cutoff & SD-cutoff methods of handling outliers


In [57]:
def sideplots(title, serieslist, namelist, **kwargs):
    
    assert len(serieslist) == len(namelist)
    count = len(serieslist)
    
    from matplotlib import pyplot as plt
    # plt.subplots() creates the figure; its axes are replaced below by the
    # subplot2grid() calls, which draw onto the same (current) figure
    fig, axes = plt.subplots(nrows=count, ncols=3, **kwargs)
    
    #fig.set_figheight(10)
    #fig.set_figwidth(15)
    
    #fig.suptitle('t', fontsize=25)
    #plt.xlabel('xlabel', fontsize=18)
    #plt.ylabel('ylabel', fontsize=16)
    
    plots = [(namelist[i], serieslist[i]) for i in range(count)]
    
    for (i, (n, s)) in enumerate(plots):
        
        ax_hist = plt.subplot2grid((count, 3), (i, 0), colspan=2)
        ax_hist.set_title(n, fontsize=16)
        
        ax_line = plt.subplot2grid((count, 3), (i, 2), colspan=1)
        ax_line.set_title(n, fontsize=16)
        
        s.plot(ax=ax_line, linewidth=3)
        s.hist(ax=ax_hist, bins=20)
        
    fig.suptitle(title, fontsize=22)
    plt.show()
    #fig.tight_layout()

In [58]:
pids5 = sorted(set((p) for p in db_isip5.index.get_level_values('pid')))
isip5 = {p: db_isip5.xs(p) for p in pids5}

def isip5_hist(p):
    # currently a no-op: all plotting code below is commented out
    ints = isip5[p].int_raw
    ints_filt = isip5[p].ints_filtered
    #sideplots(ints, ints_filt, 
    #          plotname_top="pre-filter",
    #          plotname_bottom="post-filter"):
    #ints.hist()
    #plt.show()
    
for p in pids5:
    print(p)
    isip5_hist(p)
    #ri = raw_input()
    #if ri=="x": 
    break
    

#isip5['015']


015

In [60]:
pids8 = sorted(set((p) for p in db_isip8.index.get_level_values('pid')))
isip8 = {p: db_isip8.xs(p) for p in pids8}

pids8_gen = (p for p in pids8)

def next_twenty_800():
    # despite the name, each call plots at most nine participants; the shared
    # pids8_gen generator keeps its position between calls

    ran = 0

    for p in pids8_gen:
        ran += 1
        if ran==10: break

        i_raw = isip8[p].int_raw

        c = i_raw.count()
        print(c)
        m = i_raw.mean()
        s = i_raw.std()

        i_abs_cutoff = i_raw[(i_raw >= 600) & (i_raw <= 1000)]
        i_sd_cutoff = i_raw[(i_raw <= m + 2.97*s) & (i_raw >= m - 2.97*s)]

        sideplots(title='P. %s' % p,
                  serieslist=[i_raw, i_abs_cutoff, i_sd_cutoff], 
                  namelist=['raw', 'abs cutoff', 'sd cutoff'])
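
(Aside, not from the original run: the ±2.97 SD cutoff used above keeps roughly the central 99.7% of a normal distribution, i.e. it excludes about 0.3% two-tailed.)

In [ ]:
from scipy import stats
print(2 * stats.norm.sf(2.97))   # ~0.003 two-tailed mass beyond +/- 2.97 SD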

In [74]:
next_twenty_800()


152
152
145
127
142
143
152
139

In [72]:
isip5['055'].ints_filtered.plot(marker="o")


Out[72]:
<matplotlib.axes.AxesSubplot at 0x1faeca58>

In [65]:
pids = (p for p in pids5)

def next_ten():

    ran = 0

    for p in pids:
        p = '055'  # temporarily pinned to pid '055' for inspection
        
        ran += 1
        if ran==10: break

        i_raw = isip5[p].int_raw

        c = i_raw.count()
        print(c)
        m = i_raw.mean()
        s = i_raw.std()

        i_abs_cutoff = i_raw[(i_raw >= 375) & (i_raw <= 650)]
        i_sd_cutoff = i_raw[(i_raw <= m + 2.97*s) & (i_raw >= m - 2.97*s)]

        sideplots(title='P. %s' % p,
                  serieslist=[i_raw, i_abs_cutoff, i_sd_cutoff], 
                  namelist=['raw', 'abs cutoff', 'sd cutoff'],
                  figsize=(15,8))
        break

In [66]:
next_ten()


283

In [94]:
pids5 = sorted(set((p) for p in db_isip5.index.get_level_values('pid')))
isip5 = {p: db_isip5.xs(p) for p in pids5}

pids8 = sorted(set((p) for p in db_isip8.index.get_level_values('pid')))
isip8 = {p: db_isip8.xs(p) for p in pids8}

for p in pids8:
    sideplots(title='P. %s' % p,
              serieslist=[isip8[p].ints_filtered], 
              namelist=['ints_filtered'],
              figsize=(19,5))



In [72]:
#df['drift'] = 100 * (1. / df.ints_mean) * np.sqrt(df.ints_variance - df.local_sq_abs)

#p017:
100 * (1. / 844.185541) * np.sqrt(1740.213950 - 1040.648841)

# problem: we have local_sq_abs values that are greater 
# than the total ints_variance. That shouldn't be.


Out[72]:
3.1331137388384755

In [19]:
db_isip5[db_isip5.task_ms >= isip_5_start_recording].to_csv('check_isip5_calcs_individual_pids_V4.csv')
outcomesdf_isip5.to_csv('check_isip5_calcs_outcomes_V4.csv')

In [20]:
db_isip5.xs('010', level='pid').ints.count()


Out[20]:
117

In [33]:
#db_isip8
test = outcomesdf_isip5_exp['exp_local_sq_abs'] - outcomesdf_isip5['local_sq_abs']
test.max()


Out[33]:
0.0

In [49]:
db_isip5.head().T


Out[49]:
pid 010
csv_line 8562 8564 8567 8569 8571
run_count 17 17 17 17 17
task_id 5 5 5 5 5
i 1 3 6 8 10
velocity 43 51 54 60 53
micros 2516643704 2517208392 2517742648 2518263648 2518755488
task_ms 409.128 973.816 1508.072 2029.072 2520.912
int_raw NaN 564.688 534.256 521 491.84
lag2dev NaN NaN NaN NaN NaN
lag2devsq NaN NaN NaN NaN NaN
movingmean_prev2 NaN NaN NaN NaN NaN
lagdev_avgprev2 NaN NaN NaN NaN NaN
lagdev_avgprev2sq NaN NaN NaN NaN NaN
movingmean_prev3 NaN NaN NaN NaN NaN
lagdev_avgprev3 NaN NaN NaN NaN NaN
lagdev_avgprev3sq NaN NaN NaN NaN NaN
movingmean_prev4 NaN NaN NaN NaN NaN
lagdev_avgprev4 NaN NaN NaN NaN NaN
lagdev_avgprev4sq NaN NaN NaN NaN NaN
int_filt1 NaN NaN NaN NaN NaN
int_max_exceeded NaN NaN NaN NaN NaN
ints NaN NaN NaN NaN NaN

In [104]:
#db_isip5[['task_ms', 'movingmean_prev4']].groupby(level='pid').plot(kind='scatter', x=0, y=1, figsize=(18,18))
db_isip5[['task_ms', 'movingmean_prev3']].plot(kind='scatter', x=0, y=1, figsize=(18,18))


Out[104]:
<matplotlib.axes.AxesSubplot at 0x55a17c50>

In [8]:
outcomesdf_isip8_avgprev4.sort('ints_variance', ascending=False)
#outcomesdf_isip5[30:]


Out[8]:
ints_count ints_mean ints_variance ints_stdev lagdev_avgprev4sq_sum lagdev_avgprev4sq_count lagdev_avgprev4sq_mean lagdev_avgprev4sq_local_sq_abs lagdev_avgprev4sq_local lagdev_avgprev4sq_drift
pid
048 15 916.353067 20867.669247 144.456461 1413.420112 2 706.710056 353.355028 2.051364 15.630241
049 131 677.489649 6168.530649 78.539994 640079.383384 119 5378.818348 2689.409174 7.654658 8.706270
073 142 651.723690 5567.003086 74.612352 240273.168960 138 1741.109920 870.554960 4.527251 10.515290
071 129 716.267969 4861.394718 69.723703 257113.257986 125 2056.906064 1028.453032 4.477305 8.643519
065 94 888.713532 3097.764909 55.657568 138198.965205 60 2303.316087 1151.658043 3.818563 4.963883
036 129 703.468155 2774.862029 52.676959 319606.785295 121 2641.378391 1320.689195 5.166017 5.420802
062 115 793.888557 2763.623445 52.570176 247692.904354 107 2314.886957 1157.443478 4.285390 5.048212
050 113 815.714973 2581.123395 50.804758 249713.540472 109 2290.949913 1145.474956 4.149104 4.644999
079 112 799.880643 2463.848699 49.637171 312241.442559 100 3122.414426 1561.207213 4.939751 3.756059
112 125 730.919936 2441.423644 49.410764 201279.851170 121 1663.469844 831.734922 3.945687 5.489100
015 137 702.533956 2430.437702 49.299470 273840.045376 125 2190.720363 1095.360182 4.710979 5.200989
069 115 798.398504 2392.578919 48.913995 294280.806528 111 2651.178437 1325.589219 4.560209 4.091291
089 122 752.853475 2259.368340 47.532813 82409.339912 118 698.384237 349.192118 2.482114 5.805321
025 105 873.708838 2234.780431 47.273464 84482.066848 101 836.456107 418.228054 2.340668 4.878173
080 130 694.809015 2221.811703 47.136098 97408.345467 122 798.429061 399.214531 2.875661 6.144406
020 111 819.063459 2114.370996 45.982290 198047.988630 103 1922.796006 961.398003 3.785594 4.145644
099 124 741.676419 2017.520730 44.916820 203068.116562 120 1692.234305 846.117152 3.921938 4.614650
043 132 701.174636 1999.840545 44.719577 99567.367423 128 777.870058 388.935029 2.812628 5.724121
086 111 823.767604 1966.517909 44.345438 168360.224507 107 1573.460042 786.730021 3.404930 4.169627
114 117 760.652444 1941.435481 44.061724 257821.200139 105 2455.440001 1227.720001 4.606419 3.512176
077 117 781.664718 1937.097322 44.012468 225173.202000 113 1992.683204 996.341602 4.038161 3.923900
023 114 810.928772 1901.405562 43.605109 221269.804635 110 2011.543678 1005.771839 3.910813 3.690477
087 89 922.643146 1882.635147 43.389344 66521.666558 63 1055.899469 527.949735 2.490362 3.989197
058 113 814.722372 1881.234906 43.373205 206449.352136 109 1894.030754 947.015377 3.777190 3.751585
105 123 744.315415 1875.171194 43.303247 172491.447506 119 1449.507962 724.753981 3.616915 4.556912
029 115 804.279235 1867.368872 43.213064 248772.631647 111 2241.194880 1120.597440 4.162152 3.397715
017 109 844.185541 1740.213950 41.715872 143206.684566 105 1363.873186 681.936593 3.093386 3.853557
121 105 870.896114 1675.054895 40.927435 155709.983251 101 1541.683002 770.841501 3.187984 3.452782
068 112 812.597286 1536.414100 39.197119 181396.409978 104 1744.196250 872.098125 3.634187 3.171845
072 112 825.148071 1482.495409 38.503187 169901.322776 108 1573.160396 786.580198 3.398910 3.197027
... ... ... ... ... ... ... ... ... ... ...
076 114 812.193018 727.804233 26.977847 94891.601549 110 862.650923 431.325462 2.557074 2.120008
081 114 779.827544 687.355127 26.217458 67012.581453 102 656.986093 328.493046 2.324152 2.429211
033 116 795.213966 677.040070 26.019994 87007.273412 112 776.850655 388.425328 2.478391 2.136364
063 114 800.623333 676.930632 26.017891 89612.764257 110 814.661493 407.330747 2.520840 2.050839
019 116 794.750414 676.364586 26.007010 80265.162034 112 716.653232 358.326616 2.381819 2.243927
028 122 753.348820 667.002049 25.826383 80715.285312 118 684.027842 342.013921 2.454855 2.392972
120 121 758.141686 666.742129 25.821350 84332.383746 117 720.789605 360.394802 2.504026 2.308642
042 119 756.064739 665.248659 25.792415 68057.120333 115 591.801046 295.900523 2.275170 2.541903
066 110 811.625927 644.061508 25.378367 68121.385873 98 695.116182 347.558091 2.296983 2.121578
082 118 777.074814 613.571344 24.770372 86968.594176 114 762.882405 381.441203 2.513338 1.960663
041 119 770.669849 579.816556 24.079380 76222.351133 115 662.803053 331.401527 2.362158 2.045128
098 118 777.291593 567.512488 23.822521 79510.471323 114 697.460275 348.730137 2.402485 1.902927
119 111 832.191856 550.818229 23.469517 65808.892054 107 615.036374 307.518187 2.107230 1.874337
075 119 777.781613 532.321319 23.072090 50763.319894 115 441.420173 220.710086 1.910088 2.269598
102 114 804.779053 531.380717 23.051697 47076.259142 110 427.965992 213.982996 1.817661 2.213733
110 114 807.640421 530.966500 23.042710 74662.088049 110 678.746255 339.373127 2.280976 1.713848
057 119 773.640000 523.642247 22.883231 66580.947648 115 578.964762 289.482381 2.199238 1.977959
027 112 822.994571 522.639055 22.861300 53110.808332 108 491.766744 245.883372 1.905319 2.021396
078 115 796.646783 516.727957 22.731651 65273.319254 111 588.047921 294.023961 2.152413 1.873261
034 114 812.306596 513.101841 22.651751 65086.298002 110 591.693618 295.846809 2.117451 1.814533
022 109 823.375670 511.808289 22.623180 32921.279468 101 325.953262 162.976631 1.550475 2.268349
031 109 829.454862 500.124746 22.363469 58005.872594 105 552.436882 276.218441 2.003704 1.804016
115 119 777.876975 492.878399 22.200865 66013.674230 115 574.031950 287.015975 2.177921 1.844496
047 110 841.013018 468.186673 21.637622 45479.634149 106 429.053152 214.526576 1.741558 1.893753
116 113 794.877381 431.205669 20.765492 57209.350172 105 544.850954 272.425477 2.076461 1.585251
070 119 771.457983 413.095668 20.324755 46867.943760 115 407.547337 203.773669 1.850384 1.875405
013 114 834.087930 338.276056 18.392283 36457.388143 98 372.014165 186.007082 1.635132 1.479428
026 117 787.017641 326.880811 18.079845 41266.683642 113 365.191891 182.595945 1.716964 1.526251
032 117 792.519453 280.617415 16.751639 32422.220689 113 286.922307 143.461153 1.511323 1.477740
045 0 NaN NaN NaN NaN 0 NaN NaN NaN NaN

112 rows × 10 columns


In [32]:
#don't need this: we'll get the full ID list when we merge them together later.

# set an index with full ID list to ensure that the index is consistent
# across all of the scales/index values/etc. that we'll be concatenating
# together once all processing steps are done.

#pid_list = ['010', '011', '012', '013', '014', '015', '016', '017',
#            '018', '019', '020', '021', '022', '023', '024', '025',
#            '026', '027', '028', '029', '030', '031', '032', '033',
#            '034', '035', '036', '037', '038', '039', '040', '041',
#            '042', '043', '044', '045', '046', '047', '048', '049',
#            '050', '051', '052', '053', '054', '055', '056', '057',
#            '058', '059', '060', '061', '062', '063', '064', '065',
#            '066', '067', '068', '069', '070', '071', '072', '073',
#            '074', '075', '076', '077', '078', '079', '080', '081',
#            '082', '083', '084', '085', '086', '087', '088', '089',
#            '090', '091', '092', '093', '094', '095', '096', '097',
#            '098', '099', '100', '101', '102', '103', '104', '105',
#            '106', '107', '108', '109', '110', '111', '112', '113',
#            '114', '115', '116', '117', '118', '119', '120', '121']

#full_index = pd.DataFrame(index = pid_list)

In [31]:
#not using this anymore: it's easier to have all of the multiindex values
#(scales, isip5, isip8, etc.) set up in the final assembly, rather than
#sticking ISIP5 / ISIP8 together in a multiindex already here. So we'll
#save the tasks under separate pickle files.

isip_out = pd.concat([scales,
                 isip_outcomes_taskdf(db_isip5), 
                 isip_outcomes_taskdf(db_isip8),
                 ], 
                axis=1, #defaults
                join='outer', 
                #join_axes=None, 
                #ignore_index=False, 
                keys=['scales','isip5','isip8'],
                #levels=None, 
                names=['task'], 
                #verify_integrity=False
                )
isip_out[31:37]


Out[31]:
task scales ... isip8
session_day date time session_sex exclusion_language exclusion_overall exclusion_rhythmmethodsgeneral exclusion_rhythmadminerror age wasivocab_rawscore ... ints_count ints_mean ints_variance ints_std lag2devsq_count lag2devsq_sum lag2devsq_mean local_sq_abs local drift
041 20140326 3/26/2014 1:00 PM f 0 0 0 0 23 40 ... 119 770.669849 579.816556 24.079380 117 102163.465424 873.192012 436.596006 2.711262 1.552867
042 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
043 20140328 3/28/2014 11:00 AM f 0 0 0 0 19 29 ... 132 701.174636 1999.840545 44.719577 130 188007.440288 1446.211079 723.105540 3.835082 5.095939
044 20140331 3/31/2014 10:30 AM f 0 0 0 0 18 40 ... 111 827.880505 1191.182247 34.513508 109 164717.853136 1511.172965 755.586482 3.320278 2.521008
045 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
046 20140331 3/31/2014 1:30 PM m 0 0 0 0 18 38 ... 124 740.406000 1015.423245 31.865706 122 137458.143136 1126.706091 563.353046 3.205682 2.871660

6 rows × 42 columns

Checking distributions of ISIP intervals


In [20]:
counts = {pid: str(db_isip5.xs(pid, level='pid').ints.count()) + " @500, "
             + str(db_isip8.xs(pid, level='pid').ints.count()) + " @800"
             for pid in pid_list}
counts


Out[20]:
{'010': '117 @500, 129 @800',
 '011': '118 @500, 128 @800',
 '012': '109 @500, 120 @800',
 '013': '117 @500, 114 @800',
 '014': '116 @500, 119 @800',
 '015': '118 @500, 137 @800',
 '016': '107 @500, 112 @800',
 '017': '113 @500, 109 @800',
 '018': '117 @500, 114 @800',
 '019': '116 @500, 116 @800',
 '020': '108 @500, 111 @800',
 '021': '117 @500, 121 @800',
 '022': '118 @500, 109 @800',
 '023': '118 @500, 114 @800',
 '024': '115 @500, 124 @800',
 '025': '106 @500, 105 @800',
 '026': '110 @500, 117 @800',
 '027': '116 @500, 112 @800',
 '028': '113 @500, 122 @800',
 '029': '119 @500, 115 @800',
 '030': '110 @500, 108 @800',
 '031': '115 @500, 109 @800',
 '032': '114 @500, 117 @800',
 '033': '114 @500, 116 @800',
 '034': '115 @500, 114 @800',
 '035': '114 @500, 116 @800',
 '036': '113 @500, 129 @800',
 '037': '116 @500, 119 @800',
 '038': '116 @500, 112 @800',
 '039': '121 @500, 120 @800',
 '040': '111 @500, 118 @800',
 '041': '113 @500, 119 @800',
 '043': '118 @500, 132 @800',
 '044': '114 @500, 111 @800',
 '046': '114 @500, 124 @800',
 '047': '118 @500, 110 @800',
 '049': '156 @500, 131 @800',
 '050': '120 @500, 113 @800',
 '051': '112 @500, 126 @800',
 '052': '115 @500, 111 @800',
 '053': '113 @500, 105 @800',
 '054': '111 @500, 113 @800',
 '055': '116 @500, 125 @800',
 '056': '115 @500, 125 @800',
 '057': '114 @500, 119 @800',
 '058': '109 @500, 113 @800',
 '059': '117 @500, 117 @800',
 '060': '122 @500, 111 @800',
 '061': '111 @500, 114 @800',
 '062': '115 @500, 115 @800',
 '063': '115 @500, 114 @800',
 '064': '113 @500, 120 @800',
 '065': '110 @500, 94 @800',
 '066': '118 @500, 110 @800',
 '067': '127 @500, 122 @800',
 '068': '119 @500, 112 @800',
 '069': '122 @500, 115 @800',
 '070': '116 @500, 119 @800',
 '071': '121 @500, 129 @800',
 '072': '119 @500, 112 @800',
 '073': '127 @500, 142 @800',
 '074': '116 @500, 113 @800',
 '075': '113 @500, 119 @800',
 '076': '115 @500, 114 @800',
 '077': '114 @500, 117 @800',
 '078': '117 @500, 115 @800',
 '079': '115 @500, 112 @800',
 '080': '118 @500, 130 @800',
 '081': '119 @500, 114 @800',
 '082': '111 @500, 118 @800',
 '083': '109 @500, 112 @800',
 '084': '117 @500, 112 @800',
 '085': '114 @500, 114 @800',
 '086': '106 @500, 111 @800',
 '087': '112 @500, 89 @800',
 '088': '118 @500, 119 @800',
 '089': '117 @500, 122 @800',
 '090': '120 @500, 119 @800',
 '091': '119 @500, 131 @800',
 '092': '112 @500, 120 @800',
 '093': '123 @500, 117 @800',
 '094': '113 @500, 81 @800',
 '095': '113 @500, 115 @800',
 '096': '110 @500, 105 @800',
 '097': '119 @500, 115 @800',
 '098': '115 @500, 118 @800',
 '099': '112 @500, 124 @800',
 '100': '114 @500, 118 @800',
 '101': '118 @500, 126 @800',
 '102': '115 @500, 114 @800',
 '103': '112 @500, 116 @800',
 '104': '108 @500, 113 @800',
 '105': '115 @500, 123 @800',
 '106': '115 @500, 107 @800',
 '107': '114 @500, 113 @800',
 '108': '103 @500, 118 @800',
 '109': '109 @500, 122 @800',
 '110': '115 @500, 114 @800',
 '111': '114 @500, 103 @800',
 '112': '113 @500, 125 @800',
 '113': '115 @500, 114 @800',
 '114': '117 @500, 117 @800',
 '115': '111 @500, 119 @800',
 '116': '113 @500, 113 @800',
 '117': '108 @500, 78 @800',
 '118': '113 @500, 109 @800',
 '119': '108 @500, 111 @800',
 '120': '120 @500, 121 @800',
 '121': '115 @500, 105 @800'}

In [53]:
for pid in pid_list:
    try:
        plt.figure(figsize=(14,3))
        db_isip8.loc[pid].ints.hist(bins=40)
        print(pid)
        plt.show()
    except: pass
    
#outliers: 054, one point >1000
# 036, one point > 1000
# 056, two points > 900
# 062, one point > 1100
# 048, two points < 650 (almost whole set is >1000)


010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
043
044
046
047
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

In [14]:
for pid in pid_list:
    try:
        plt.figure(figsize=(14,3))
        db_isip5.loc[pid].ints.hist(bins=40)
        print(pid)
        plt.show()
    except: pass
    
#outliers: 054, one point >1000
# 036, one point > 1000
# 056, two points > 900
# 062, one point > 1100
# 048, two points < 650 (almost whole set is >1000)


010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
043
044
046
047
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

In [ ]:
dfo.to_csv('isip_data_output.csv')
!isip_data_output.csv
# (shell escape: opens the CSV with its associated application on Windows)

In [73]:
#for viewing data that will be kept...
data = dfo.loc[dfo.scales.exclusion_language==0]

In [28]:
#pd.set_option('display.multi_sparse', False)
#dfo['isip5']['ints_mean']
#dfo['isip5','ints_mean']
#dfo.loc(axis=1)['isip5','ints_mean']
#dfo.loc(axis=1)[('isip5','ints_mean')]

dfo['isip5'].drift[dfo['isip5'].drift > 6]


Out[28]:
049    9.978668
055    8.000231
Name: drift, dtype: float64

In [42]:
dfo.head()


Out[42]:
task scales scales scales scales scales scales scales scales scales scales ... isip8 isip8 isip8 isip8 isip8 isip8 isip8 isip8 isip8 isip8
session_day date time session_sex exclusion_language exclusion_overall exclusion_rhythmmethodsgeneral exclusion_rhythmadminerror age wasivocab_rawscore ... ints_count ints_mean ints_variance ints_std lag2devsq_count lag2devsq_sum lag2devsq_mean local_sq_abs local drift
pid
010 20140226 2/26/2014 1:20 PM m 1 0 0 0 18 29 ... 129 774.104930 1138.331372 33.739167 127 153047.935504 1205.101854 602.550927 3.171006 2.990154
011 20140226 2/26/2014 2:40 PM f 0 0 0 0 19 40 ... 128 776.994000 883.209799 29.718846 126 208781.623552 1656.997012 828.498506 3.704488 0.951965
012 20140226 2/26/2014 4:00 PM f 0 0 0 0 57 33 ... 120 831.791733 986.297632 31.405376 118 176171.441216 1492.978315 746.489158 3.284711 1.861734
013 20140227 2/27/2014 8:00 AM m 0 0 0 1 21 42 ... 114 834.087930 338.276056 18.392283 106 60632.582384 572.005494 286.002747 2.027558 0.866818
014 20140228 2/28/2014 9:10 AM f 1 0 0 0 25 13 ... 119 845.244504 1018.963019 31.921200 117 176923.391440 1512.165739 756.082869 3.253137 1.918212

5 rows × 42 columns


In [44]:
dfo.xs('scales', level='task', axis=1).to_json('isip_data_output_scales.json')
dfo.xs('isip5', level='task', axis=1).to_json('isip_data_output_isip5.json')
dfo.xs('isip8', level='task', axis=1).to_json('isip_data_output_isip8.json')

Preliminary scatterplot, isip500 / 800


In [17]:
outcomesdf_isip5


Out[17]:
ints_count ints_mean ints_variance ints_stdev lag2devsq_sum lag2devsq_count lag2devsq_mean lag2devsq_local_sq_abs lag2devsq_local lag2devsq_drift
pid
010 117 484.570188 431.516467 20.772974 61689.291664 113 545.922935 272.961468 3.409526 2.598563
011 118 485.356814 231.312109 15.208948 59610.206416 116 513.881090 256.940545 3.302594 NaN
012 109 528.153248 619.335497 24.886452 134842.083888 107 1260.206391 630.103196 4.752760 NaN
013 117 492.018496 191.325997 13.832064 45051.603072 115 391.753070 195.876535 2.844525 NaN
014 116 495.567621 426.417128 20.649870 100343.064752 114 880.202322 440.101161 4.233244 NaN
015 118 482.560712 503.989402 22.449708 71439.642368 114 626.663530 313.331765 3.668177 2.861377
016 107 534.489682 337.322524 18.366342 81375.980656 105 775.009340 387.504670 3.682978 NaN
017 113 507.232283 372.501872 19.300308 76301.810240 111 687.403696 343.701848 3.654972 1.058009
018 117 490.434838 169.564390 13.021689 33365.922272 115 290.138455 145.069227 2.455875 1.009157
019 116 496.082310 87.338616 9.345513 19395.184096 114 170.133194 85.066597 1.859199 0.303845
020 108 527.804963 589.663762 24.282993 95309.265760 106 899.144017 449.572008 4.017225 2.242502
021 117 493.456479 491.635470 22.172854 103278.063904 115 898.070121 449.035060 4.294289 1.322690
022 118 491.002881 266.638426 16.329067 66905.256720 116 576.769454 288.384727 3.458614 NaN
023 118 487.401525 209.743705 14.482531 47419.766096 116 408.791087 204.395544 2.933248 0.474477
024 115 497.052209 346.236721 18.607437 73709.621120 113 652.297532 326.148766 3.633339 0.901708
025 106 537.173396 495.696599 22.264245 59565.535456 104 572.745533 286.372767 3.150296 2.693363
026 110 522.279564 484.524188 22.011910 99789.086832 108 923.973026 461.986513 4.115396 0.908974
027 116 492.205552 227.230850 15.074178 47722.113664 114 418.615032 209.307516 2.939313 0.860128
028 113 499.047115 340.013136 18.439445 73322.526480 109 672.683729 336.341865 3.674929 0.383943
029 119 480.923597 383.615111 19.586095 71476.612096 117 610.911214 305.455607 3.634109 1.838293
030 110 526.321818 152.433529 12.346397 26781.463872 108 247.976517 123.988259 2.115626 1.013336
031 115 503.174574 222.846559 14.928046 67330.682048 113 595.846744 297.923372 3.430312 NaN
032 114 504.738105 123.587386 11.116986 23833.411232 112 212.798315 106.399157 2.043633 0.821390
033 114 501.366632 280.591619 16.750869 85712.136320 112 765.286931 382.643466 3.901591 NaN
034 115 501.191200 225.441641 15.014714 42139.552128 113 372.916391 186.458195 2.724503 1.245767
035 114 504.927754 396.148055 19.903468 82621.804784 112 737.694686 368.847343 3.803593 1.034803
036 113 506.245097 865.523721 29.419785 209699.127152 111 1889.181326 944.590663 6.071011 NaN
037 116 497.986897 609.504061 24.688136 100630.167984 114 882.720772 441.360386 4.218701 2.603888
038 116 496.498345 194.346045 13.940805 36632.072896 114 321.333973 160.666986 2.552969 1.168859
039 121 475.983207 383.342106 19.579124 64152.844432 119 539.099533 269.549767 3.449275 2.241119
... ... ... ... ... ... ... ... ... ... ...
092 112 508.889214 422.469593 20.554065 107997.824320 110 981.798403 490.899201 4.353845 NaN
093 123 465.726732 412.370742 20.306914 73061.555584 121 603.814509 301.907254 3.730831 2.256723
094 113 509.198549 705.218200 26.555945 144728.472992 111 1303.860117 651.930059 5.014335 1.433600
095 113 496.906018 337.276204 18.365081 71364.353344 109 654.718838 327.359419 3.641147 0.633740
096 110 522.939891 369.646599 19.226196 85088.745520 108 787.858755 393.929377 3.795399 NaN
097 119 486.892672 450.793265 21.231893 110019.624912 117 940.338674 470.169337 4.453423 NaN
098 115 499.021948 232.085405 15.234350 50146.397600 113 443.773430 221.886715 2.985011 0.639960
099 112 514.290429 291.855066 17.083766 57254.421216 110 520.494738 260.247369 3.136785 1.093171
100 114 504.409088 344.916823 18.571936 69286.214752 112 618.626917 309.313459 3.486715 1.182940
101 118 483.521424 276.944489 16.641649 70894.575456 116 611.160133 305.580067 3.615320 NaN
102 115 503.016904 406.461517 20.160891 85176.583648 113 753.775077 376.887538 3.859431 1.081116
103 112 514.388250 478.128078 21.866140 90473.666432 110 822.487877 411.243938 3.942382 1.589903
104 108 531.871444 481.559774 21.944470 69251.269280 106 653.313861 326.656931 3.398124 2.340039
105 115 503.235965 373.097980 19.315744 84927.682624 113 751.572413 375.786206 3.852111 NaN
106 115 497.394748 259.228145 16.100564 29725.450832 113 263.057087 131.528544 2.305732 2.271923
107 114 507.268667 240.647244 15.512809 64212.007584 112 573.321496 286.660748 3.337691 NaN
108 103 559.819107 651.107208 25.516802 106842.165168 101 1057.843219 528.921610 4.108166 1.974523
109 109 531.315853 407.849711 20.195289 92851.086032 107 867.767159 433.883580 3.920431 NaN
110 115 495.863270 151.669191 12.315405 38776.196176 113 343.152179 171.576089 2.641596 NaN
111 114 507.458877 375.364702 19.374331 95714.069888 112 854.589910 427.294955 4.073456 NaN
112 113 512.244920 667.787840 25.841591 53438.306144 111 481.426181 240.713091 3.028811 4.034357
113 115 497.162678 139.779273 11.822828 30864.131664 113 273.133909 136.566954 2.350576 0.360505
114 117 482.347009 905.847583 30.097302 199473.923248 113 1765.255958 882.627979 6.159270 0.999005
115 111 518.681153 386.652012 19.663469 84921.336288 109 779.094828 389.547414 3.805219 NaN
116 113 508.455540 214.243404 14.637056 55277.700768 111 497.997304 248.998652 3.103455 NaN
117 108 528.354889 484.106776 22.002427 66093.075616 106 623.519581 311.759791 3.341830 2.484712
118 113 505.189274 518.832753 22.777901 87404.963440 111 787.432103 393.716052 3.927692 2.214132
119 108 533.036630 144.675042 12.028094 21337.248112 106 201.294794 100.647397 1.882107 1.244817
120 120 476.299600 261.461009 16.169756 55264.225152 118 468.340891 234.170446 3.212816 1.096796
121 115 499.836139 226.263531 15.042059 43008.093712 113 380.602599 190.301300 2.759899 1.199764

109 rows × 10 columns


In [11]:
x = outcomesdf_isip5['lag2devsq_mean']
y = outcomesdf_isip5_avgprev3['lagdev_avgprev3sq_mean']
#ids = sorted(set(x.index).intersection(set(y.index)))
#df = pd.DataFrame(index = ids)
sx = x.apply(lambda n: np.sqrt(n))
sy = y.apply(lambda n: np.sqrt(n))
dfo = pd.concat([sx, sy], axis=1, join='outer', 
                keys = ['x' + ' sqrt of ' + x.name,
                        'y' + ' sqrt of ' + y.name])
dfo.plot(x=0, y=1, kind='scatter', figsize=(12,12))


Out[11]:
<matplotlib.axes.AxesSubplot at 0xc828f98>

In [22]:
plt.figure(figsize=(8,5))
plt.scatter(data.isip8.drift, data.isip5.drift)
plt.show()

print(data.isip8.drift[data.isip8.drift > 6])
print(data.isip5.drift[data.isip5.drift > 6])

data_rem_drift_ol = dfo.drop(['049', '055', '071', '073'])

plt.figure(figsize=(8,5))
plt.scatter(data_rem_drift_ol.isip8.drift, data_rem_drift_ol.isip5.drift)
plt.show()


pid
071     8.124596
073    10.119067
Name: drift, dtype: float64
pid
049    9.978668
055    8.000231
Name: drift, dtype: float64

In [23]:
data.scales.fsiq2
data.isip5.local


Out[23]:
pid
011    3.302594
012    4.752760
013    2.844525
015    3.668177
016    3.682978
017    3.654972
018    2.455875
019    1.859199
020    4.017225
021    4.294289
022    3.458614
024    3.633339
025    3.150296
026    4.115396
027    2.939313
...
107    3.337691
108    4.108166
109    3.920431
110    2.641596
111    4.073456
112    3.028811
113    2.350576
114    6.159270
115    3.805219
116    3.103455
117    3.341830
118    3.927692
119    1.882107
120    3.212816
121    2.759899
Name: local, Length: 102, dtype: float64

In [24]:
plt.figure(figsize=(8,5))
plt.scatter(data.isip8.local, data.isip5.local)
plt.show()

print(data.isip8.drift[data.isip8.local > 9])
print(data.isip5.drift[data.isip5.local > 9])

data_rem_local_ol = dfo.drop(['049'])

plt.figure(figsize=(8,5))
plt.scatter(data_rem_local_ol.isip8.local, data_rem_local_ol.isip5.local)
plt.show()


pid
049    5.272842
Name: drift, dtype: float64
Series([], name: drift, dtype: float64)

In [25]:
plt.figure(figsize=(8,5))
plt.scatter(data.scales.wasivocab_rawscore, data.scales.wasimatrix_rawscore)
plt.show()



In [26]:
plt.figure(figsize=(8,5))
plt.scatter(data.scales.wasivocab_tscore, data.isip8.local)
plt.show()



In [27]:
compare_df = pd.concat([data.scales.wasivocab_tscore, 
                        data.isip8.local], 
                       axis=1)
compare_df = compare_df[compare_df.wasivocab_tscore.notnull()]
compare_df.loc['049'].local = 7  # cap the '049' outlier for display; chained assignment triggers the warning below

plt.figure(figsize=(8,5))
plt.scatter(compare_df.wasivocab_tscore, compare_df.local)
plt.show()

compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats


r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))


C:\Applications\_Data analysis\Anaconda\lib\site-packages\pandas\core\generic.py:1858: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  self[name] = value
r: -0.150423484883, p: 0.131278387372
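
The warning comes from the chained assignment compare_df.loc['049'].local = 7; a single .loc call with both row and column labels avoids it (the two similar cells below would take the same fix):

In [ ]:
compare_df.loc['049', 'local'] = 7   # one indexing call, no SettingWithCopyWarning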

In [28]:
compare_df = pd.concat([data.scales.wasimatrix_tscore, 
                        data.isip8.local], 
                       axis=1)
compare_df = compare_df[compare_df.wasimatrix_tscore.notnull()]
compare_df.loc['049'].local = 7

plt.figure(figsize=(8,5))
plt.scatter(compare_df.wasimatrix_tscore, compare_df.local)
plt.show()

compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats


r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))


r: -0.152668983147, p: 0.127469736219

In [29]:
compare_df = pd.concat([data.scales.fsiq2, 
                        data.isip8.local], 
                       axis=1)
compare_df = compare_df[compare_df.fsiq2.notnull()]
compare_df.loc['049'].local = 7

plt.figure(figsize=(8,5))
plt.scatter(compare_df.fsiq2, compare_df.local)
plt.show()

compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats


r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))


r: -0.204788026516, p: 0.0399464145313

In [30]:
def rtest(df_columns):
    from scipy import stats
    import matplotlib.pylab as plt
    print(df_columns.columns[0])
    print(df_columns.columns[1])
    plt.figure(figsize=(3,3))
    # plot the two passed-in columns (previously hard-coded to the
    # global compare_df.fsiq2 / compare_df.local)
    plt.scatter(df_columns.iloc[:, 0], df_columns.iloc[:, 1])
    plt.show()

rtest(compare_df)


fsiq2
local

In [31]:
compare_df = pd.concat([data.scales.fsiq2, 
                        data.isip5.local], 
                       axis=1)
compare_df = compare_df[compare_df.fsiq2.notnull()]

assert len(compare_df[compare_df.fsiq2.isnull()]) == 0

In [32]:
compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats
stats.pearsonr(compare_arr[0], compare_arr[1])


Out[32]:
(-0.012201312518071359, 0.90361193985416333)

Testing / visualizing - not needed in final 'product'

Plotting interval results after filtering steps


In [175]:
def scatter_tooltips(df, x_col, y_col, 
                 size_col=None, 
                 color_col=None, 
                 show_all_cols=False,
                 fig_size=(8, 5)):
    #import matplotlib.pyplot as plt
    #import numpy as np
    import pandas as pd
    import mpld3
    from mpld3 import plugins

    #x = df[x_col]
    #y = df[y_col]
    df_info = [x_col, y_col]
    #for arg in args:
    #    df_info.append(arg)

    # Define some CSS to control our custom labels
    css = """
    table { border-collapse: collapse; }
    th { color: #ffffff; background-color: #000000; }
    td { background-color: #cccccc; }
    table, th, td { font-family:Arial, Helvetica, sans-serif;
                    border: 1px solid black; text-align: right; }
    """

    fig, ax = plt.subplots()
    fig.set_size_inches(fig_size)
    ax.grid(True, alpha=0.3)

    labels = []
    for row in df.iterrows():
        index, series = row
        pid = index
        label = pd.DataFrame(series)
        labels.append(str(label.to_html()))

    points = ax.plot(df[x_col], 
                     df[y_col], 
                     'o', 
                     color='b',
                     markeredgecolor='k', 
                     markersize=8, 
                     markeredgewidth=1, 
                     alpha=.6)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(x_col + ' . ' + y_col, size=16)
    tooltip = plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    plugins.connect(fig, tooltip)

    return mpld3.display()

In [176]:
scatter_tooltips(data, 'isip8_sq2dev_mean_sqrt', 'isip5_sq2dev_mean_sqrt',
                 fig_size=(12, 7.5))


Out[176]:

In [132]:
def d3plot(x, y, size=(10,6)):
    import mpld3

    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
    fig.set_size_inches(size)  # was hard-coded to (8,5), ignoring the size argument
    scatter = ax.scatter(x, y,
                         #c=np.random.random(size=N),
                         s=40, #size
                         alpha=0.5,
                         cmap=plt.cm.jet)
    ax.grid(color='white', linestyle='solid')
    ax.set_title("Scatter Plot (with tooltips!)", size=10)
    labels = ['{0}'.format(pid) for pid in x.index]
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)
    return mpld3.display()

In [133]:
d3plot(data['isip5_sq2devsum'], data['isip8_sq2devsum'])


Out[133]:

Intervals after filtering


In [26]:
sset = db_isip8.xs(['056', 'tap_r'], level=['pid', 'stamp_type'])
print(sset.ints.mean())
print(sset.ints.std())


728.982048
37.2019991333

In [18]:
for pid in pid_list:
    try:
        sset = db_isip5.xs([pid, 'tap_r'], level=['pid', 'stamp_type'])
        data = sset.ints
        if data.min() > 400 or data.max() < 600: continue        
        print(pid)
        plt.figure(figsize=(13,6))
        data.hist(bins=100)
        #annotate values far from the median
        caption_y_increment = 0.2
        median = data.median()
        prev_ypos = 0
        for idx, value in enumerate(data):
            if np.abs(value - median) > 150:
                caption = str(data.index[idx]) + ": " + str(value.round(1))
                plt.annotate(caption, (value, prev_ypos + caption_y_increment))
                prev_ypos += caption_y_increment
        plt.show()
    except:
        print("error....")

#High outliers:
#p049 - will need to remove manually


049
055

In [99]:
taps = db_isip5.xs('tap_r', level='stamp_type').ints
pmeans = taps.groupby(level='pid').mean()

data = pmeans

##############################

plt.figure(figsize=(13,6))
data.hist(bins=25)

#annotate values far from the median
caption_y_increment = 1
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
    if np.abs(value - median) > 25:
        caption = str(data.index[idx]) + ": " + str(value.round(1))
        plt.annotate(caption, (value, prev_ypos + caption_y_increment))
        prev_ypos += caption_y_increment
plt.show()

#why is pid 049's mean so low?



In [100]:
sset = db_isip8.xs(['048', 'tap_r'], level=['pid', 'stamp_type'])
data = sset.ints.mean()
data - data*0.5  # i.e. half of p048's mean interval


Out[100]:
524.87915909090907

In [79]:
sset[60:]


Out[79]:
run_count task_id i channel pitch velocity micros task_ms int_raw int_filt1 int_max_exceeded int_lag2dev ints
task_name csv_line
ISIP_8 9628 21 3 117 1 48 92 2373733160 54572.428 1076.716 1076.716 False 47.040 1076.716
9629 21 3 118 1 48 117 2374789772 55629.040 1056.612 1056.612 False 64.404 1056.612
9630 21 3 119 1 48 100 2375801208 56640.476 1011.436 1011.436 False -65.280 1011.436
9631 21 3 120 1 48 111 2376926120 57765.388 1124.912 1124.912 False 68.300 1124.912
9632 21 3 121 1 48 111 2378075728 58914.996 1149.608 1149.608 False 138.172 1149.608
9633 21 3 122 1 48 121 2379113188 59952.456 1037.460 1037.460 False -87.452 1037.460
9634 21 3 123 1 48 97 2380251568 61090.836 1138.380 1138.380 False -11.228 1138.380
9635 21 3 124 1 48 127 2381400428 62239.696 1148.860 1148.860 False 111.400 1148.860
9636 21 3 125 1 48 79 2382490936 63330.204 1090.508 1090.508 False -47.872 1090.508
9637 21 3 126 1 48 105 2383587036 64426.304 1096.100 1096.100 False -52.760 1096.100
9638 21 3 127 1 48 127 2384698024 65537.292 1110.988 1110.988 False 20.480 1110.988
9639 21 3 128 1 48 111 2385825580 66664.848 1127.556 1127.556 False 31.456 1127.556
9640 21 3 129 1 48 112 2386941280 67780.548 1115.700 1115.700 False 4.712 1115.700
9641 21 3 130 1 48 127 2388091424 68930.692 1150.144 1150.144 False 22.588 1150.144
9642 21 3 131 1 48 108 2389179152 70018.420 1087.728 1087.728 False -27.972 1087.728
9644 21 3 133 1 48 127 2390218064 71057.332 1038.912 1038.912 False -111.232 1038.912
9645 21 3 134 1 48 121 2391315900 72155.168 1097.836 1097.836 False 10.108 1097.836
9646 21 3 135 1 48 103 2392416444 73255.712 1100.544 1100.544 False 61.632 1100.544
9647 21 3 136 1 48 110 2393596776 74436.044 1180.332 1180.332 False 82.496 1180.332
9648 21 3 137 1 48 117 2394694828 75534.096 1098.052 1098.052 False -2.492 1098.052
9649 21 3 138 1 48 79 2395813540 76652.808 1118.712 1118.712 False -61.620 1118.712
9651 21 3 140 1 48 19 2396327468 77166.736 513.928 513.928 False -584.124 513.928
9652 21 3 141 1 48 97 2396959476 77798.744 632.008 632.008 False -486.704 632.008
9653 21 3 142 1 48 127 2398009728 78848.996 1050.252 1050.252 False 536.324 1050.252
9654 21 3 143 1 48 117 2399073920 79913.188 1064.192 1064.192 False 432.184 1064.192
9656 21 3 145 1 48 127 2400149340 80988.608 1075.420 1075.420 False 25.168 1075.420
9657 21 3 146 1 48 123 2401271772 82111.040 1122.432 1122.432 False 58.240 1122.432
9658 21 3 147 1 48 100 2402320156 83159.424 1048.384 1048.384 False -27.036 1048.384
9659 21 3 148 1 48 113 2403437916 84277.184 1117.760 1117.760 False -4.672 1117.760
9660 21 3 149 1 48 124 2404513744 85353.012 1075.828 1075.828 False 27.444 1075.828
... ... ... ... ... ... ... ... ... ... ... ... ... ...
9663 21 3 152 1 48 106 2407753000 88592.268 1097.376 1097.376 False 33.196 1097.376
9664 21 3 153 1 48 127 2408894168 89733.436 1141.168 1141.168 False 63.468 1141.168
9665 21 3 154 1 48 127 2409955116 90794.384 1060.948 1060.948 False -36.428 1060.948
9666 21 3 155 1 48 98 2411092752 91932.020 1137.636 1137.636 False -3.532 1137.636
9667 21 3 156 1 48 103 2412197028 93036.296 1104.276 1104.276 False 43.328 1104.276
9668 21 3 157 1 48 95 2413307044 94146.312 1110.016 1110.016 False -27.620 1110.016
9669 21 3 158 1 48 127 2414405856 95245.124 1098.812 1098.812 False -5.464 1098.812
9670 21 3 159 1 48 107 2415495400 96334.668 1089.544 1089.544 False -20.472 1089.544
9671 21 3 160 1 48 127 2416539816 97379.084 1044.416 1044.416 False -54.396 1044.416
9672 21 3 161 1 48 107 2417615288 98454.556 1075.472 1075.472 False -14.072 1075.472
9673 21 3 162 1 48 127 2418704512 99543.780 1089.224 1089.224 False 44.808 1089.224
9674 21 3 163 1 48 119 2419783108 100622.376 1078.596 1078.596 False 3.124 1078.596
9675 21 3 164 1 48 118 2420889492 101728.760 1106.384 1106.384 False 17.160 1106.384
9677 21 3 166 1 48 93 2421971488 102810.756 1081.996 1081.996 False 3.400 1081.996
9678 21 3 167 1 48 92 2423056596 103895.864 1085.108 1085.108 False -21.276 1085.108
9679 21 3 168 1 48 127 2424098628 104937.896 1042.032 1042.032 False -39.964 1042.032
9680 21 3 169 1 48 127 2425119560 105958.828 1020.932 1020.932 False -64.176 1020.932
9681 21 3 170 1 48 127 2426156576 106995.844 1037.016 1037.016 False -5.016 1037.016
9682 21 3 171 1 48 127 2427229152 108068.420 1072.576 1072.576 False 51.644 1072.576
9683 21 3 172 1 48 127 2428292348 109131.616 1063.196 1063.196 False 26.180 1063.196
9684 21 3 173 1 48 114 2429364964 110204.232 1072.616 1072.616 False 0.040 1072.616
9685 21 3 174 1 48 90 2430461228 111300.496 1096.264 1096.264 False 33.068 1096.264
9686 21 3 175 1 48 127 2431537956 112377.224 1076.728 1076.728 False 4.112 1076.728
9687 21 3 176 1 48 127 2432566892 113406.160 1028.936 1028.936 False -67.328 1028.936
9688 21 3 177 1 48 92 2433586864 114426.132 1019.972 1019.972 False -56.756 1019.972
9689 21 3 178 1 48 123 2434660704 115499.972 1073.840 1073.840 False 44.904 1073.840
9690 21 3 179 1 48 119 2435683356 116522.624 1022.652 1022.652 False 2.680 1022.652
9692 21 3 181 1 48 127 2436797604 117636.872 1114.248 1114.248 False 40.408 1114.248
9693 21 3 182 1 48 121 2437885104 118724.372 1087.500 1087.500 False 64.848 1087.500
9694 21 3 183 1 48 100 2439016980 119856.248 1131.876 1131.876 False 17.628 1131.876

62 rows × 13 columns


In [65]:
sset = db_isip5.xs(['055', 'tap_r'], level=['pid', 'stamp_type'])
sset_ints = sset.ints

sset_ints.hist(bins=100, figsize=(14,5))


Out[65]:
<matplotlib.axes.AxesSubplot at 0x27494d68>

In [98]:
taps = db_isip8.xs('tap_r', level='stamp_type').ints
pmeans = taps.groupby(level='pid').mean()

data = pmeans

##############################

plt.figure(figsize=(13,6))
data.hist(bins=25)

#annotating non-midpoint values
caption_y_increment = 0.8
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
    if np.abs(value - median) > 60:
        caption = str(data.index[idx]) + ": " + str(value.round(1))
        plt.annotate(caption, (value, prev_ypos + caption_y_increment))
        prev_ypos += caption_y_increment
plt.show()

#Good distribution-- 048 is a bit high



In [60]:
ISIP5_DATASTART = 22000

sset = db_isip5.xs(['055', 'tap_r'], level=['pid', 'stamp_type'])
sset[sset.task_ms > ISIP5_DATASTART]

# a double-interval has snuck through here.


Out[60]:
run_count task_id i channel pitch velocity micros task_ms int_raw
task_name csv_line
ISIP_5 9471 17 5 113 1 48 65 1732756792 22613.980 2091.876
9472 17 5 114 1 48 51 1732790128 22647.316 33.336
9473 17 5 115 1 48 88 1733174484 23031.672 384.356
9474 17 5 116 1 48 60 1733222708 23079.896 48.224
9475 17 5 117 1 48 87 1733628264 23485.452 405.556
9476 17 5 118 1 48 53 1733682880 23540.068 54.616
9477 17 5 119 1 48 82 1734103016 23960.204 420.136
9478 17 5 120 1 48 52 1734154328 24011.516 51.312
9479 17 5 121 1 48 76 1734594460 24451.648 440.132
9480 17 5 122 1 48 61 1734631988 24489.176 37.528
9481 17 5 123 1 48 70 1735060468 24917.656 428.480
9482 17 5 124 1 48 50 1735104280 24961.468 43.812
9483 17 5 125 1 48 64 1735558120 25415.308 453.840
9484 17 5 126 1 48 52 1735593328 25450.516 35.208
9485 17 5 127 1 48 54 1736072748 25929.936 479.420
9486 17 5 128 1 48 46 1736106608 25963.796 33.860
9487 17 5 129 1 48 74 1736565836 26423.024 459.228
9488 17 5 130 1 48 54 1736604256 26461.444 38.420
9489 17 5 131 1 48 75 1737178084 27035.272 573.828
9490 17 5 132 1 48 64 1737204848 27062.036 26.764
9491 17 5 133 1 48 76 1738085532 27942.720 880.684
9492 17 5 134 1 48 66 1738110116 27967.304 24.584
9493 17 5 135 1 48 72 1738631120 28488.308 521.004
9494 17 5 136 1 48 58 1738656776 28513.964 25.656
9495 17 5 137 1 48 71 1739140532 28997.720 483.756
9496 17 5 138 1 48 57 1739166840 29024.028 26.308
9497 17 5 139 1 48 73 1739618944 29476.132 452.104
9498 17 5 140 1 48 54 1739642048 29499.236 23.104
9499 17 5 141 1 48 75 1740106948 29964.136 464.900
9500 17 5 142 1 48 56 1740130408 29987.596 23.460
... ... ... ... ... ... ... ... ... ...
9652 17 5 294 1 48 75 1782736108 72593.296 500.512
9653 17 5 295 1 48 62 1782763460 72620.648 27.352
9654 17 5 296 1 48 77 1783236524 73093.712 473.064
9655 17 5 297 1 48 64 1783262904 73120.092 26.380
9656 17 5 298 1 48 72 1783750856 73608.044 487.952
9657 17 5 299 1 48 55 1783777084 73634.272 26.228
9658 17 5 300 1 48 70 1784250972 74108.160 473.888
9659 17 5 301 1 48 58 1784270844 74128.032 19.872
9660 17 5 302 1 48 65 1784766872 74624.060 496.028
9661 17 5 303 1 48 54 1784792432 74649.620 25.560
9662 17 5 304 1 48 69 1785263992 75121.180 471.560
9663 17 5 305 1 48 57 1785291876 75149.064 27.884
9664 17 5 306 1 48 72 1785784980 75642.168 493.104
9665 17 5 307 1 48 59 1785810392 75667.580 25.412
9666 17 5 308 1 48 69 1786281656 76138.844 471.264
9667 17 5 309 1 48 53 1786307440 76164.628 25.784
9668 17 5 310 1 48 65 1786797260 76654.448 489.820
9669 17 5 311 1 48 49 1786825580 76682.768 28.320
9670 17 5 312 1 48 70 1787330664 77187.852 505.084
9671 17 5 313 1 48 54 1787357344 77214.532 26.680
9672 17 5 314 1 48 69 1787852992 77710.180 495.648
9673 17 5 315 1 48 56 1787883188 77740.376 30.196
9674 17 5 316 1 48 66 1788365152 78222.340 481.964
9675 17 5 317 1 48 57 1788394148 78251.336 28.996
9676 17 5 318 1 48 70 1788869604 78726.792 475.456
9677 17 5 319 1 48 58 1788900252 78757.440 30.648
9678 17 5 320 1 48 72 1789369428 79226.616 469.176
9679 17 5 321 1 48 59 1789397380 79254.568 27.952
9680 17 5 322 1 48 70 1789879792 79736.980 482.412
9681 17 5 323 1 48 58 1789906620 79763.808 26.828

211 rows × 9 columns
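
A double-length interval like this one can also be flagged programmatically rather than by eye. A sketch, assuming the ISIP_5 target interval is 500 ms (the 1.5 multiplier is an uncalibrated guess):

In [ ]:
# flag intervals long enough to be two ~500 ms taps merged into one
ints = sset.int_raw.dropna()
print(ints[ints > 1.5 * 500])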


In [45]:
taps = db_isip5.xs('tap_r', level='stamp_type').ints
pstdev = taps.groupby(level='pid').std()
data = pstdev


plt.figure(figsize=(13,6))
data.hist(bins=25)

#annotating non-midpoint values
caption_y_increment = 1
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
    if np.abs(value - median) > 8:
        caption = str(data.index[idx]) + ": " + str(value.round(1))
        plt.annotate(caption, (value, prev_ypos + caption_y_increment))
        prev_ypos += caption_y_increment
plt.show()
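
The annotate-the-outliers loop above is repeated across several cells with only the threshold and y-increment changing; factoring it into a helper (hypothetical name annotate_outliers) would keep the cells in sync. A sketch:

In [ ]:
def annotate_outliers(data, threshold, caption_y_increment=1):
    # label values more than `threshold` away from the median,
    # stacking captions vertically so they don't overlap
    median = data.median()
    prev_ypos = 0
    for idx, value in enumerate(data):
        if np.abs(value - median) > threshold:
            caption = str(data.index[idx]) + ": " + str(round(value, 1))
            plt.annotate(caption, (value, prev_ypos + caption_y_increment))
            prev_ypos += caption_y_increment

# usage, replacing the loop in the cell above:
# plt.figure(figsize=(13,6)); data.hist(bins=25); annotate_outliers(data, 8); plt.show()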



In [93]:
lt = [1, 2, 3, 4, 5, 7]
r = [2, 3, 4]

set(lt).difference(set(r))



Out[93]:
{1, 5, 7}
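
Applied to the real data, the same set difference finds participants with no rows at all for a task. A sketch, assuming pid is an index level of db_isip5 as the xs() calls above imply:

In [ ]:
present = set(db_isip5.index.get_level_values('pid'))
missing = set(pid_list).difference(present)
print(sorted(missing))  # pids with no ISIP_5 rows at all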

Collect information about missing or incomplete task data


In [14]:
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'

In [14]:
#EXPECTED_RTAP_MINS = {}

In [15]:
dfo = pd.DataFrame(index = pid_list)
dfo['missing_tasks'] = ''

for t in TASK_NAMES_USING:
    df = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
    for p in pid_list:
        dfp = df.xs(p, drop_level=True)
        if dfp.micros.count() == 0: dfo.loc[p, 'missing_tasks'] += t + ' '

dfo.loc[dfo.missing_tasks != '']


Out[15]:
missing_tasks
010 Improv_Melody
011 Ticks_Linear_8 Improv_Melody
012 Jits_Linear_5 Jits_Linear_8 Improv_Melody
013 T1_SMS_5 T1_SMS_8 Ticks_ISO_T2_5 Ticks_ISO_T2_...
014 Improv_Melody
015 Improv_Melody
018 Jits_Linear_5
031 T1_SMS_5 T1_SMS_8 Ticks_ISO_T2_5 Ticks_ISO_T2_...
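
The explicit double loop works; the same information can also come from a single groupby count. A sketch, assuming dbase is indexed by (task_name, pid, stamp_type, csv_line) as the xs() calls above imply:

In [ ]:
counts = dbase.micros.groupby(level=['task_name', 'pid']).count()
counts = counts.unstack('task_name').reindex(pid_list).fillna(0)
has_missing = (counts[TASK_NAMES_USING] == 0).any(axis=1)
print(list(has_missing[has_missing].index))  # pids missing at least one task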

In [16]:
# Number of loopback MIDI signals per participant per task
# Slight variations for the improv tasks: possibly due to 
# overflows (checkable in the original CSV files)?

# T1_SMS_5 and T1_SMS_8 tasks were slightly longer for 
# participants 010 through 015. 
#(sms5: 150 vs 130; sms8: 140 vs 120)
# Should discard the additional intervals for these p's.

loopct = pd.DataFrame(index = pid_list)

for t in TASK_NAMES_USING:
    loopct['lbcount_' + t] = np.nan
    dft = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
    for p in pid_list:
        dftp = dft.xs(p, drop_level=True)
        task_lbcount = dftp.xs('loopback').micros.count()
        loopct.loc[p, 'lbcount_' + t] = task_lbcount  # .loc[row, col] avoids chained assignment

loopct[::25]
#loopct.to_csv(....)


Out[16]:
lbcount_T1_SMS_5 lbcount_T1_SMS_8 lbcount_Ticks_ISO_T2_5 lbcount_Ticks_ISO_T2_8 lbcount_Ticks_Linear_5 lbcount_Ticks_Linear_8 lbcount_Ticks_Phase_5 lbcount_Ticks_Phase_8 lbcount_Jits_ISO_5 lbcount_Jits_ISO_8 lbcount_Jits_Phase_5 lbcount_Jits_Phase_8 lbcount_Jits_Linear_5 lbcount_Jits_Linear_8 lbcount_ISIP_5 lbcount_ISIP_8 lbcount_Improv_Metronome lbcount_Improv_Melody
010 150 140 130 120 170 170 170 170 360 360 510 510 510 510 40 30 140 0
035 130 120 130 120 170 170 170 170 360 360 510 510 510 510 40 30 140 162
060 130 120 130 120 170 170 170 170 360 360 510 510 510 510 40 30 140 162
085 130 120 130 120 170 170 170 170 360 360 510 510 510 510 40 30 140 162
110 130 120 130 120 170 170 170 170 360 360 510 510 510 510 40 30 140 162
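
Per the comment above, participants 010 through 015 got the longer T1_SMS runs; the surplus loopbacks can be located by comparing each count against the standard run length. A sketch, with the expected counts taken from that comment:

In [ ]:
EXPECTED_LB = {'T1_SMS_5': 130, 'T1_SMS_8': 120}  # standard-length runs

for t, n_expected in EXPECTED_LB.items():
    surplus = loopct['lbcount_' + t] - n_expected
    print(t, "- participants with extra loopbacks:")
    print(surplus[surplus > 0])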

In [17]:
#lbdata = dfo.drop('missing_tasks', axis=1)
data = loopct
for c in data.columns:
    print(c)
    if 'ISIP' in c:
        print('skipping ISIP data\n')
        continue
    data_range = data[c].max() - data[c].min()
    if data_range==0:
        print("all data points = {0} \n\n".format(data[c].max()))
    else:
        plt.figure(figsize=(13,3))
        data[c].hist(bins=int(data_range))  # hist() expects an integer bin count
        
        #annotating non-midpoint values
        median = data[c].median()
        prev_ypos = 0
        for idx, value in enumerate(data[c]):
            if value != median:
                caption = str(data[c].index[idx]) + ": " + str(value)
                plt.annotate(caption, (value, prev_ypos + 10))
                prev_ypos += 10
        plt.show()


lbcount_T1_SMS_5
lbcount_T1_SMS_8
lbcount_Ticks_ISO_T2_5
lbcount_Ticks_ISO_T2_8
lbcount_Ticks_Linear_5
lbcount_Ticks_Linear_8
lbcount_Ticks_Phase_5
lbcount_Ticks_Phase_8
lbcount_Jits_ISO_5
lbcount_Jits_ISO_8
lbcount_Jits_Phase_5
lbcount_Jits_Phase_8
lbcount_Jits_Linear_5
lbcount_Jits_Linear_8
lbcount_ISIP_5
skipping ISIP data

lbcount_ISIP_8
skipping ISIP data

lbcount_Improv_Metronome
lbcount_Improv_Melody

In [18]:
tapct = pd.DataFrame(index = pid_list)

for t in TASK_NAMES_USING:
    tapct[t] = np.nan
    dft = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
    for p in pid_list:
        dftp = dft.xs(p, drop_level=True)
        task_tapcount = dftp.xs('tap_r').micros.count()
        tapct.loc[p, t] = task_tapcount  # .loc[row, col] avoids chained assignment

In [19]:
data = tapct
for c in data.columns:
    print(c)
    if 'ISIP' in c:
        print('skipping ISIP data\n')
        continue
    data_range = data[c].max() - data[c].min()
    if data_range==0:
        print("all data points = {0} \n\n".format(data[c].max()))
    else:
        plt.figure(figsize=(13,3))
        data[c].hist(bins=40)
        
        #annotating non-midpoint values
        caption_y_increment = 5
        median = data[c].median()
        prev_ypos = 0
        for idx, value in enumerate(data[c]):
            if np.abs(value - median) > 40:
                caption = str(data[c].index[idx]) + ": " + str(value)
                plt.annotate(caption, (value, prev_ypos + caption_y_increment))
                prev_ypos += caption_y_increment
        plt.show()
        
#isip: Why such a large number of timestamps for some p's?
#ISIP_5: 094: 281, 055: 284, 041: 279, ...
#ISIP_8: 041: 288, 055: 280, 017: 217, ...
#...Right: these aren't filtered yet here!


T1_SMS_5
T1_SMS_8
Ticks_ISO_T2_5
Ticks_ISO_T2_8
Ticks_Linear_5
Ticks_Linear_8
Ticks_Phase_5
Ticks_Phase_8
Jits_ISO_5
Jits_ISO_8
Jits_Phase_5
Jits_Phase_8
Jits_Linear_5
Jits_Linear_8
ISIP_5
skipping ISIP data

ISIP_8
skipping ISIP data

Improv_Metronome
Improv_Melody
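
The last comment can be verified directly: comparing non-null counts of int_raw and the filtered ints column shows how many taps the grouping filter removed per participant. A sketch, assuming both columns are present on db_isip5 as in the cells above:

In [ ]:
rtaps = db_isip5.xs('tap_r', level='stamp_type')
compare = pd.DataFrame({'raw': rtaps.int_raw.groupby(level='pid').count(),
                        'filtered': rtaps.ints.groupby(level='pid').count()})
compare['dropped'] = compare['raw'] - compare['filtered']
print(compare[compare.dropped > 0])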

Older exploratory work


In [136]:
def custom_histogram(data_series):
    cleanintervals = data_series.dropna()
    millis = cleanintervals.divide(1000)  # microseconds -> milliseconds
    millis_array = np.array(millis)
    plt.title("Interval lengths")
    plt.xlabel("Milliseconds")
    plt.ylabel("Frequency")
    # all other hist() arguments left at their defaults
    plt.hist(millis_array, bins=35)
    plt.gcf().set_size_inches(7, 7)
    plt.show()

In [607]:
print("pre-filter length, maximum:")
print(len(taps.interval))  #before filter
print(max(taps.interval)); print()

tapsfilt = taps.copy(deep = True)
tapsfilt.loc[
             tapsfilt['interval'] > 750000, # boolean selector on axis 0
             'interval'                     # index value selector on axis 1
             ] = np.nan

print("post-filter length, maximum:")
print(len(tapsfilt.interval))  #after (includes NaN)
print(max(tapsfilt.interval))

custom_histogram(tapsfilt.interval)


pre-filter length, maximum:
161
892052.0

post-filter length, maximum:
161
531780.0
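
Both lengths come out at 161 because the filter masks overlong intervals with NaN instead of dropping rows; the dropna() inside custom_histogram is what actually removes them before plotting. The maximum falls from 892052.0 to 531780.0 as expected.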